import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Moneyball CSV from its (machine-specific) absolute path and
# preview the resulting frame.
csv_path = '/users/mrocchio/desktop/moneyball.csv'
df = pd.read_csv(csv_path)
df
| INDEX | TARGET_WINS | TEAM_BATTING_H | TEAM_BATTING_2B | TEAM_BATTING_3B | TEAM_BATTING_HR | TEAM_BATTING_BB | TEAM_BATTING_SO | TEAM_BASERUN_SB | TEAM_BASERUN_CS | TEAM_BATTING_HBP | TEAM_PITCHING_H | TEAM_PITCHING_HR | TEAM_PITCHING_BB | TEAM_PITCHING_SO | TEAM_FIELDING_E | TEAM_FIELDING_DP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 39 | 1445 | 194 | 39 | 13 | 143 | 842 | 9364 | 84 | 927 | 5456 | 1011 | ||||
| 1 | 2 | 70 | 1339 | 219 | 22 | 190 | 685 | 1075 | 37 | 28 | 1347 | 191 | 689 | 1082 | 193 | 155 | |
| 2 | 3 | 86 | 1377 | 232 | 35 | 137 | 602 | 917 | 46 | 27 | 1377 | 137 | 602 | 917 | 175 | 153 | |
| 3 | 4 | 70 | 1387 | 209 | 38 | 96 | 451 | 922 | 43 | 30 | 1396 | 97 | 454 | 928 | 164 | 156 | |
| 4 | 5 | 82 | 1297 | 186 | 27 | 102 | 472 | 920 | 49 | 39 | 1297 | 102 | 472 | 920 | 138 | 168 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2271 | 2531 | 83 | 1404 | 294 | 25 | 144 | 522 | 990 | 100 | 39 | 45 | 1404 | 144 | 522 | 990 | 102 | 152 |
| 2272 | 2532 | 67 | 1361 | 276 | 27 | 151 | 496 | 925 | 109 | 38 | 35 | 1361 | 151 | 496 | 925 | 99 | 171 |
| 2273 | 2533 | 81 | 1367 | 311 | 32 | 117 | 491 | 1090 | 45 | 45 | 89 | 1367 | 117 | 491 | 1090 | 92 | 156 |
| 2274 | 2534 | 71 | 1437 | 322 | 22 | 164 | 594 | 1156 | 123 | 62 | 69 | 1437 | 164 | 594 | 1156 | 131 | 123 |
| 2275 | 2535 | 31 | 1116 | 157 | 62 | 15 | 262 | 969 | 2870 | 39 | 674 | 2492 | 1026 |
2276 rows × 17 columns
# Visualise where values are missing.
# Build a *separate* frame for this: the previous `empty = df` only aliased
# df, so turning blanks into missing values silently rewrote the source data.
empty = df.replace(' ', np.nan)
fig, ax = plt.subplots(figsize=(18, 15))
# Transposed so that each column of df becomes one row of the heatmap.
ax = sns.heatmap(empty.isnull().T, ax=ax)
plt.show()
## Upon review, a blank field indicates that the value is zero, so blanks and
## missing entries are both filled with 0 before the column is made numeric.
def _blank_to_zero(value):
    """Return '0' for a single-space placeholder, else the value unchanged."""
    return '0' if value == ' ' else value


for column in df.columns:
    zero_filled = df[column].apply(_blank_to_zero).fillna(0)
    df[column] = pd.to_numeric(zero_filled)
df
| INDEX | TARGET_WINS | TEAM_BATTING_H | TEAM_BATTING_2B | TEAM_BATTING_3B | TEAM_BATTING_HR | TEAM_BATTING_BB | TEAM_BATTING_SO | TEAM_BASERUN_SB | TEAM_BASERUN_CS | TEAM_BATTING_HBP | TEAM_PITCHING_H | TEAM_PITCHING_HR | TEAM_PITCHING_BB | TEAM_PITCHING_SO | TEAM_FIELDING_E | TEAM_FIELDING_DP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 39 | 1445 | 194 | 39 | 13 | 143 | 842 | 0 | 0 | 0 | 9364 | 84 | 927 | 5456 | 1011 | 0 |
| 1 | 2 | 70 | 1339 | 219 | 22 | 190 | 685 | 1075 | 37 | 28 | 0 | 1347 | 191 | 689 | 1082 | 193 | 155 |
| 2 | 3 | 86 | 1377 | 232 | 35 | 137 | 602 | 917 | 46 | 27 | 0 | 1377 | 137 | 602 | 917 | 175 | 153 |
| 3 | 4 | 70 | 1387 | 209 | 38 | 96 | 451 | 922 | 43 | 30 | 0 | 1396 | 97 | 454 | 928 | 164 | 156 |
| 4 | 5 | 82 | 1297 | 186 | 27 | 102 | 472 | 920 | 49 | 39 | 0 | 1297 | 102 | 472 | 920 | 138 | 168 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2271 | 2531 | 83 | 1404 | 294 | 25 | 144 | 522 | 990 | 100 | 39 | 45 | 1404 | 144 | 522 | 990 | 102 | 152 |
| 2272 | 2532 | 67 | 1361 | 276 | 27 | 151 | 496 | 925 | 109 | 38 | 35 | 1361 | 151 | 496 | 925 | 99 | 171 |
| 2273 | 2533 | 81 | 1367 | 311 | 32 | 117 | 491 | 1090 | 45 | 45 | 89 | 1367 | 117 | 491 | 1090 | 92 | 156 |
| 2274 | 2534 | 71 | 1437 | 322 | 22 | 164 | 594 | 1156 | 123 | 62 | 69 | 1437 | 164 | 594 | 1156 | 131 | 123 |
| 2275 | 2535 | 31 | 1116 | 157 | 62 | 15 | 262 | 969 | 0 | 0 | 0 | 2870 | 39 | 674 | 2492 | 1026 | 0 |
2276 rows × 17 columns
# Pairwise correlation of every column, drawn as a diverging heatmap
# centred on zero.
corr = df.corr()
diverging = sns.diverging_palette(20, 220, n=200)
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=diverging, square=True)
# Slant the column names so the long labels do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')
[Text(0.5, 0, 'INDEX'), Text(1.5, 0, 'TARGET_WINS'), Text(2.5, 0, 'TEAM_BATTING_H'), Text(3.5, 0, 'TEAM_BATTING_2B'), Text(4.5, 0, 'TEAM_BATTING_3B'), Text(5.5, 0, 'TEAM_BATTING_HR'), Text(6.5, 0, 'TEAM_BATTING_BB'), Text(7.5, 0, 'TEAM_BATTING_SO'), Text(8.5, 0, 'TEAM_BASERUN_SB'), Text(9.5, 0, 'TEAM_BASERUN_CS'), Text(10.5, 0, 'TEAM_BATTING_HBP'), Text(11.5, 0, 'TEAM_PITCHING_H'), Text(12.5, 0, 'TEAM_PITCHING_HR'), Text(13.5, 0, 'TEAM_PITCHING_BB'), Text(14.5, 0, 'TEAM_PITCHING_SO'), Text(15.5, 0, 'TEAM_FIELDING_E'), Text(16.5, 0, 'TEAM_FIELDING_DP')]
import warnings
# Suppress every warning from here on, then draw the first pair plot
# coloured by TARGET_WINS.
warnings.filterwarnings("ignore")
first_pairplot_vars = [
    'TEAM_BATTING_H',
    'TEAM_BATTING_2B',
    'TEAM_BATTING_3B',
    'TEAM_BATTING_HR',
    'TEAM_BATTING_BB',
    'TEAM_BATTING_HBP',
    'TEAM_BASERUN_SB',
    'TEAM_FIELDING_DP',
    'TEAM_PITCHING_SO',
]
sns.pairplot(df, hue='TARGET_WINS', palette="hls", vars=first_pairplot_vars)
<seaborn.axisgrid.PairGrid at 0x7ffcc2a24dc0>
# Second pair plot over the remaining columns, again coloured by TARGET_WINS.
second_pairplot_vars = [
    'TEAM_BATTING_SO',
    'TEAM_BASERUN_SB',
    'TEAM_BASERUN_CS',
    'TEAM_FIELDING_E',
    'TEAM_FIELDING_DP',
    'TEAM_PITCHING_BB',
    'TEAM_PITCHING_H',
    'TEAM_PITCHING_HR',
]
sns.pairplot(df, hue='TARGET_WINS', palette="hls", vars=second_pairplot_vars)
<seaborn.axisgrid.PairGrid at 0x7ffcc33ddca0>
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Build the model inputs: standardized feature matrix `x` and the
# min-max-scaled target `y` (kept as a single-column 2-D array).
random_state = 0
features = [
    'TEAM_BATTING_H', 'TEAM_BATTING_2B', 'TEAM_BATTING_3B',
    'TEAM_BATTING_HR', 'TEAM_BATTING_BB', 'TEAM_BATTING_SO',
    'TEAM_BASERUN_SB', 'TEAM_BASERUN_CS', 'TEAM_BATTING_HBP',
    'TEAM_PITCHING_H', 'TEAM_PITCHING_HR', 'TEAM_PITCHING_BB',
    'TEAM_PITCHING_SO', 'TEAM_FIELDING_E', 'TEAM_FIELDING_DP',
]
raw_features = df[features].to_numpy()
raw_target = df['TARGET_WINS'].to_numpy()

# The scalers are kept under their own names so that predictions can be
# mapped back to the original units later on.
y_scale = MinMaxScaler()
y = y_scale.fit_transform(raw_target.reshape(-1, 1))
x_scale = StandardScaler()
x = x_scale.fit_transform(raw_features)
x
array([[-0.16788785, -1.009741 , -0.58176129, ..., 8.25534726,
3.35726424, -2.35434125],
[-0.9011503 , -0.47545173, -1.19037311, ..., 0.53138283,
-0.23485179, 0.49677544],
[-0.63828263, -0.1976213 , -0.72496407, ..., 0.24001243,
-0.31389591, 0.45998684],
...,
[-0.70745833, 1.49073281, -0.83236616, ..., 0.54550988,
-0.67837712, 0.51516975],
[-0.22322841, 1.72582009, -1.19037311, ..., 0.66205804,
-0.50711486, -0.0918422 ],
[-2.44376847, -1.80048913, 0.24165469, ..., 3.02127534,
3.42313433, -2.35434125]])
from sklearn.decomposition import PCA
# Two-component PCA of the standardized features.
random_state = 0
# Pass the seed to the estimator: assigning `random_state` alone has no
# effect.  The projections previously drifted slightly between otherwise
# identical fits, which points to a stochastic SVD solver being selected.
pca2 = PCA(n_components=2, random_state=random_state)
# PCA is unsupervised; the target that used to be passed here was ignored.
principalComponents = pca2.fit_transform(x)
PC_df2 = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2'])
# Store the scaled target as a flat column next to the projections.
PC_df2['y'] = y.reshape(-1)
PC_df2
| PC1 | PC2 | y | |
|---|---|---|---|
| 0 | 2.965460 | 6.628874 | 0.267123 |
| 1 | -2.775733 | 0.023033 | 0.479452 |
| 2 | -1.535350 | -0.337066 | 0.589041 |
| 3 | -0.585530 | -1.071006 | 0.479452 |
| 4 | -0.973641 | -1.608303 | 0.561644 |
| ... | ... | ... | ... |
| 2271 | -2.401186 | 0.486016 | 0.568493 |
| 2272 | -2.257227 | -0.025549 | 0.458904 |
| 2273 | -2.762578 | 0.782304 | 0.554795 |
| 2274 | -3.348040 | 1.537721 | 0.486301 |
| 2275 | 3.030167 | 0.899227 | 0.212329 |
2276 rows × 3 columns
# Scatter of the two principal components, coloured by the scaled target.
fig, ax = plt.subplots(figsize=(10, 10))
sns.set_context("talk")
hls_cmap = sns.color_palette("hls", as_cmap=True)
sns.scatterplot(
    data=PC_df2,
    x="PC1",
    y="PC2",
    hue="y",
    palette=hls_cmap,
    alpha=1,
    ax=ax,
)
<AxesSubplot:xlabel='PC1', ylabel='PC2'>
# Three-component PCA of the standardized features.
random_state = 0
# Pass the seed to the estimator: assigning `random_state` alone has no
# effect, and the projections previously drifted slightly between fits.
pca3 = PCA(n_components=3, random_state=random_state)
# PCA is unsupervised; the target that used to be passed here was ignored.
principalComponents = pca3.fit_transform(x)
PC_df3 = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2', 'PC3'])
# Store the scaled target as a flat column next to the projections.
PC_df3['y'] = y.reshape(-1)
PC_df3
| PC1 | PC2 | PC3 | y | |
|---|---|---|---|---|
| 0 | 2.965460 | 6.628893 | 8.379521 | 0.267123 |
| 1 | -2.775734 | 0.023032 | 0.966064 | 0.479452 |
| 2 | -1.535350 | -0.337065 | 0.602100 | 0.589041 |
| 3 | -0.585530 | -1.071002 | 1.153818 | 0.479452 |
| 4 | -0.973641 | -1.608301 | 1.582035 | 0.561644 |
| ... | ... | ... | ... | ... |
| 2271 | -2.401186 | 0.486015 | 0.210472 | 0.568493 |
| 2272 | -2.257227 | -0.025554 | 0.368685 | 0.458904 |
| 2273 | -2.762578 | 0.782304 | 0.515692 | 0.554795 |
| 2274 | -3.348040 | 1.537721 | -0.064478 | 0.486301 |
| 2275 | 3.030167 | 0.899212 | 5.419115 | 0.212329 |
2276 rows × 4 columns
# One panel per pair of the three principal components, coloured by target.
fig, (ax, ax1, ax2) = plt.subplots(1, 3, figsize=(35, 12))
sns.set_context("talk")
panel_axes = (
    (ax, "PC1", "PC2"),
    (ax1, "PC1", "PC3"),
    (ax2, "PC2", "PC3"),
)
for panel, horizontal, vertical in panel_axes:
    sns.scatterplot(
        x=horizontal,
        y=vertical,
        hue="y",
        palette=sns.color_palette("hls", as_cmap=True),
        data=PC_df3,
        alpha=1,
        ax=panel,
    )
<AxesSubplot:xlabel='PC2', ylabel='PC3'>
# Four-component PCA of the standardized features.
random_state = 0
# Pass the seed to the estimator: assigning `random_state` alone has no
# effect, and the projections previously drifted slightly between fits.
pca4 = PCA(n_components=4, random_state=random_state)
# PCA is unsupervised; the target that used to be passed here was ignored.
principalComponents = pca4.fit_transform(x)
PC_df4 = pd.DataFrame(
    data=principalComponents, columns=['PC1', 'PC2', 'PC3', 'PC4']
)
# Store the scaled target as a flat column next to the projections.
PC_df4['y'] = y.reshape(-1)
PC_df4
| PC1 | PC2 | PC3 | PC4 | y | |
|---|---|---|---|---|---|
| 0 | 2.965460 | 6.628893 | 8.379542 | 1.283633 | 0.267123 |
| 1 | -2.775734 | 0.023032 | 0.966065 | 0.932072 | 0.479452 |
| 2 | -1.535350 | -0.337065 | 0.602101 | 0.358724 | 0.589041 |
| 3 | -0.585530 | -1.071002 | 1.153816 | -0.455177 | 0.479452 |
| 4 | -0.973641 | -1.608301 | 1.582033 | -0.063971 | 0.561644 |
| ... | ... | ... | ... | ... | ... |
| 2271 | -2.401186 | 0.486015 | 0.210472 | -0.806942 | 0.568493 |
| 2272 | -2.257227 | -0.025554 | 0.368683 | -0.734017 | 0.458904 |
| 2273 | -2.762578 | 0.782304 | 0.515691 | -1.740393 | 0.554795 |
| 2274 | -3.348040 | 1.537721 | -0.064477 | -0.608759 | 0.486301 |
| 2275 | 3.030167 | 0.899212 | 5.419104 | 0.923312 | 0.212329 |
2276 rows × 5 columns
# 2x3 grid with one panel for each pair of the four principal components,
# filled row by row.
fig, axes = plt.subplots(2, 3, figsize=(35, 36))
sns.set_context("talk")
pc_names = ["PC1", "PC2", "PC3", "PC4"]
pc_pairs = [
    (first, second)
    for position, first in enumerate(pc_names)
    for second in pc_names[position + 1:]
]
for panel, (horizontal, vertical) in zip(axes.flat, pc_pairs):
    sns.scatterplot(
        x=horizontal,
        y=vertical,
        hue="y",
        palette=sns.color_palette("hls", as_cmap=True),
        data=PC_df4,
        alpha=1,
        ax=panel,
    )
<AxesSubplot:xlabel='PC3', ylabel='PC4'>
# Report the total and per-component explained variance of each PCA fit.
fitted_models = (
    ('2 Component PCA Var Ratio: ', pca2),
    ('3 Component PCA Var Ratio: ', pca3),
    ('4 Component PCA Var Ratio: ', pca4),
)
for label, model in fitted_models:
    ratios = model.explained_variance_ratio_
    print(label, sum(ratios), ratios)
print('it looks like 6 components will get us to the desired 80 percent variance')
2 Component PCA Var Ratio: 0.5050430333881942 [0.36019814 0.14484489] 3 Component PCA Var Ratio: 0.6170544841791186 [0.36019814 0.14484489 0.11201145] 4 Component PCA Var Ratio: 0.7201476518595558 [0.36019814 0.14484489 0.11201145 0.10309317] it looks like 6 components will get us to the desired 80 percent variance
# Six-component PCA of the standardized features.
random_state = 0
# Pass the seed to the estimator: assigning `random_state` alone has no
# effect, and the projections previously drifted slightly between fits.
pca6 = PCA(n_components=6, random_state=random_state)
# PCA is unsupervised; the target that used to be passed here was ignored.
principalComponents = pca6.fit_transform(x)
PC_df6 = pd.DataFrame(
    data=principalComponents,
    columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6'],
)
# Store the scaled target as a flat column next to the projections.
PC_df6['y'] = y.reshape(-1)
PC_df6
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | y | |
|---|---|---|---|---|---|---|---|
| 0 | 2.965460 | 6.628893 | 8.379542 | 1.283633 | 0.588897 | 0.363855 | 0.267123 |
| 1 | -2.775734 | 0.023032 | 0.966065 | 0.932072 | 0.071403 | -1.220137 | 0.479452 |
| 2 | -1.535350 | -0.337065 | 0.602101 | 0.358724 | 0.218951 | -0.873948 | 0.589041 |
| 3 | -0.585530 | -1.071002 | 1.153816 | -0.455177 | 0.164921 | -0.259081 | 0.479452 |
| 4 | -0.973641 | -1.608301 | 1.582033 | -0.063971 | 0.214123 | -0.207638 | 0.561644 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2271 | -2.401186 | 0.486015 | 0.210472 | -0.806942 | -1.710270 | 0.190278 | 0.568493 |
| 2272 | -2.257227 | -0.025554 | 0.368683 | -0.734017 | -1.321389 | 0.082418 | 0.458904 |
| 2273 | -2.762578 | 0.782304 | 0.515691 | -1.740393 | -3.182891 | 0.540933 | 0.554795 |
| 2274 | -3.348040 | 1.537721 | -0.064477 | -0.608759 | -2.669348 | 0.965868 | 0.486301 |
| 2275 | 3.030167 | 0.899212 | 5.419104 | 0.923312 | -1.067332 | -0.169682 | 0.212329 |
2276 rows × 7 columns
# 5x3 grid with one panel for each of the 15 pairs of the six principal
# components, filled row by row.
fig, axes = plt.subplots(5, 3, figsize=(35, 75))
sns.set_context("talk")
pc_names = ["PC1", "PC2", "PC3", "PC4", "PC5", "PC6"]
pc_pairs = [
    (first, second)
    for position, first in enumerate(pc_names)
    for second in pc_names[position + 1:]
]
for panel, (horizontal, vertical) in zip(axes.flat, pc_pairs):
    sns.scatterplot(
        x=horizontal,
        y=vertical,
        hue="y",
        palette=sns.color_palette("hls", as_cmap=True),
        data=PC_df6,
        alpha=1,
        ax=panel,
    )
<AxesSubplot:xlabel='PC5', ylabel='PC6'>
# Report the six-component fit and plot cumulative explained variance
# against the number of components kept.
component_ratios = pca6.explained_variance_ratio_
print('6 Component PCA Var Ratio: ', sum(component_ratios), component_ratios)

# explained_variance_ratio_ is ordered PC1, PC2, ... so the counts must run
# 1..n in that same order.  The previous [6,5,4,3,2,1] labelling reversed the
# pairing, and the cumulative sum then added the smallest components first.
# NOTE: the 'clusters' column and axis label actually hold the number of
# principal components; the names are kept for compatibility.
varrate = pd.DataFrame({
    'clusters': list(range(1, len(component_ratios) + 1)),
    'explained_variance_ratio_node': component_ratios,
}).sort_values(by='clusters')
varrate['explained_variance_ratio'] = (
    varrate['explained_variance_ratio_node'].cumsum()
)

fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(x=varrate['clusters'], y=varrate['explained_variance_ratio'])
plt.grid(color='b', linestyle='-', linewidth=1)
plt.xlabel('Clusters')
plt.ylabel('Explained_Variance_Ratio')
plt.show()
6 Component PCA Var Ratio: 0.843587221750662 [0.36019814 0.14484489 0.11201145 0.10309317 0.07062241 0.05281716]
from sklearn.manifold import TSNE
# Two-dimensional t-SNE embedding.
# Seeded so the embedding is reproducible between runs.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne2 = TSNE(
    n_components=2,
    verbose=1,
    perplexity=50,
    n_iter=10000,
    random_state=random_state,
)
# Fit on the standardized matrix `x`, as the PCA fits do: t-SNE is driven by
# pairwise distances, so the raw columns with the largest ranges would
# otherwise dominate.  The target is not passed because t-SNE ignores it.
tsne_results2 = tsne2.fit_transform(x)
[t-SNE] Computing 151 nearest neighbors... [t-SNE] Indexed 2276 samples in 0.009s... [t-SNE] Computed neighbors for 2276 samples in 0.094s... [t-SNE] Computed conditional probabilities for sample 1000 / 2276 [t-SNE] Computed conditional probabilities for sample 2000 / 2276 [t-SNE] Computed conditional probabilities for sample 2276 / 2276 [t-SNE] Mean sigma: 57.463743 [t-SNE] KL divergence after 250 iterations with early exaggeration: 63.579750 [t-SNE] KL divergence after 10000 iterations: 0.837268
# Preview the first embedding coordinate of every sample as a flat array.
tsne_results2.T[0].flatten()
array([-19.671543, 37.34426 , 28.586668, ..., 48.72317 , 59.647373,
-21.608988], dtype=float32)
# Collect the 2-D embedding with the scaled target and plot it.
first_axis = tsne_results2[:, 0].flatten()
second_axis = tsne_results2[:, 1].flatten()
tsne_df2 = pd.DataFrame({'tsne-2d-1': first_axis, 'tsne-2d-2': second_axis})
tsne_df2['y'] = y

fig, ax = plt.subplots(figsize=(15, 15))
hls_cmap = sns.color_palette("hls", as_cmap=True)
sns.scatterplot(
    data=tsne_df2,
    x="tsne-2d-1",
    y="tsne-2d-2",
    hue="y",
    palette=hls_cmap,
    alpha=1,
    ax=ax,
)
plt.show()
# Three-dimensional t-SNE embedding.
# Seeded so the embedding is reproducible between runs.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne3 = TSNE(
    n_components=3,
    verbose=1,
    perplexity=50,
    n_iter=10000,
    random_state=random_state,
)
# Fit on the standardized matrix `x`, as the PCA fits do: t-SNE is driven by
# pairwise distances, so the raw columns with the largest ranges would
# otherwise dominate.  The target is not passed because t-SNE ignores it.
tsne_results3 = tsne3.fit_transform(x)
tsne_results3
[t-SNE] Computing 151 nearest neighbors... [t-SNE] Indexed 2276 samples in 0.002s... [t-SNE] Computed neighbors for 2276 samples in 0.087s... [t-SNE] Computed conditional probabilities for sample 1000 / 2276 [t-SNE] Computed conditional probabilities for sample 2000 / 2276 [t-SNE] Computed conditional probabilities for sample 2276 / 2276 [t-SNE] Mean sigma: 57.463743 [t-SNE] KL divergence after 250 iterations with early exaggeration: 63.857140 [t-SNE] KL divergence after 2950 iterations: 0.690854
array([[ -8.029726 , 4.4180837 , -16.634611 ],
[ 0.6657408 , 4.9065495 , 11.653527 ],
[ 1.7764912 , 1.4574441 , 8.25554 ],
...,
[ 0.15063517, 10.072702 , 8.840198 ],
[ -3.7298284 , 8.860112 , 9.813134 ],
[ -4.070292 , 2.617965 , -13.673218 ]], dtype=float32)
# Collect the 3-D embedding with the scaled target and plot each pair of
# embedding axes side by side.
embedding_columns = ['tsne-3d-1', 'tsne-3d-2', 'tsne-3d-3']
tsne_df3 = pd.DataFrame({
    name: tsne_results3[:, position].flatten()
    for position, name in enumerate(embedding_columns)
})
tsne_df3['y'] = y

fig, (ax, ax1, ax2) = plt.subplots(1, 3, figsize=(30, 15))
panel_axes = (
    (ax, "tsne-3d-1", "tsne-3d-2"),
    (ax1, "tsne-3d-1", "tsne-3d-3"),
    (ax2, "tsne-3d-2", "tsne-3d-3"),
)
for panel, horizontal, vertical in panel_axes:
    sns.scatterplot(
        x=horizontal,
        y=vertical,
        hue="y",
        palette=sns.color_palette("hls", as_cmap=True),
        data=tsne_df3,
        alpha=1,
        ax=panel,
    )
plt.show()
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
# Ordinary least squares on the six principal components, scored both
# in-sample ("calibration") and with 10-fold cross-validated predictions.
y = y.flatten()

# Use only the component columns as predictors.  PC_df6 also carries the
# target in its 'y' column; leaving it in the design matrix leaked the answer
# into the model and produced a meaningless perfect fit (R^2 = 1.0).
component_columns = [name for name in PC_df6.columns if name != 'y']
linear_data = PC_df6[component_columns]
X = linear_data.to_numpy()

# `normalize=True` is dropped: the argument no longer exists in current
# scikit-learn, and the components come from already-standardized features.
regr = linear_model.LinearRegression()
regr.fit(X, y)
y_pred = regr.predict(X)

# Cross-validation
y_cv = cross_val_predict(regr, X, y, cv=10)

# Calculate scores for calibration and cross-validation
score_c = r2_score(y, y_pred)
score_cv = r2_score(y, y_cv)

# Calculate mean square error for calibration and cross validation
mse_c = mean_squared_error(y, y_pred)
mse_cv = mean_squared_error(y, y_cv)

print('score_c:', score_c)
print('score_cv:', score_cv)
print('mse_c:', mse_c)
print('mse_cv:', mse_cv)
score_c: 1.0 score_cv: 1.0 mse_c: 1.1890712239236564e-32 mse_cv: 5.348210872595784e-30
# Map the scaled target back to the original win counts.  The scaler was
# fitted on a single column, so it is given an (n, 1) array here; the earlier
# (1, n) shape only worked through broadcasting.  Shown as a flat array.
y_scale.inverse_transform(y.reshape(-1, 1)).reshape(-1)
array([[39., 70., 86., ..., 81., 71., 31.]])
# Gather actual and predicted targets, both on the scaled range and mapped
# back through the target scaler to the original units.
actual_original_units = y_scale.inverse_transform(y.reshape(-1, 1)).flatten()
predicted_original_units = y_scale.inverse_transform(
    y_pred.reshape(-1, 1)
).flatten()
final = pd.DataFrame({
    'y': y.flatten(),
    'y_pred': y_pred,
    'y_tran': actual_original_units,
    'y_pred_tran': predicted_original_units,
})

# Plain scatter of predicted against actual values.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=final, x="y_tran", y="y_pred_tran", alpha=1, ax=ax)
plt.show()

# Same data with a fitted regression line drawn on top.
fig, ax = plt.subplots(figsize=(10, 10))
point_style = {"color": "blue"}
line_style = {"color": "red"}
sns.regplot(
    data=final,
    x="y_tran",
    y="y_pred_tran",
    ax=ax,
    scatter_kws=point_style,
    line_kws=line_style,
)
plt.show()